library(tidyverse)
library(plotly)
# List all task files: every "<task>.csv" in `pubmed_dir` is treated as one task.
pubmed_dir <- '../data/pubmed/tests/'
# `pattern` is a regular expression, not a glob, so anchor it on the literal
# ".csv" suffix instead of relying on "*.csv".
task_names <- dir(pubmed_dir, pattern = "\\.csv$")
# Strip only the trailing extension; an unanchored ".csv" regex would also
# remove any "<char>csv" sequence occurring inside a task name.
task_names <- str_remove(task_names, "\\.csv$")

# Count distinct PMIDs per year for every task.
# Per-task results are collected in a list and bound once after the loop,
# instead of growing `D` with rbind() on every iteration.
counts_by_task <- setNames(vector("list", length(task_names)), task_names)
for (task in task_names) {
  # print(paste0("--- task: ", task))
  filename <- paste0(pubmed_dir, task, ".csv")
  TMP <- read.csv(filename)
  # TODO: check if data is complete (e.g., 'title' %in% names(TMP))
  # The `year` column must be checked BEFORE it is touched: converting it
  # first would raise an error for files without that column, making the
  # "missing year info" branch unreachable.
  if ("year" %in% names(TMP)) {
    counts_by_task[[task]] <- TMP %>%
      mutate(year = as.numeric(year)) %>%
      filter(!is.na(year), year < 2021) %>% # filtering out 2021 because year is incomplete
      select(pmid, year) %>%
      distinct() %>%
      group_by(year) %>%
      summarise(n = n(), .groups = "drop") %>%
      mutate(task = task)
  } else {
    print(paste0("--- missing year info: ", task))
  }
}
# Skipped tasks stay NULL in the list and are ignored by bind_rows().
D <- bind_rows(counts_by_task)
# Clean up loop temporaries; intersect() avoids a warning when no file was read
# and `TMP` therefore never existed.
rm(list = intersect(c("TMP", "counts_by_task"), ls()))
# Drop the excluded tasks and implausible years, then add the running
# (cumulative) count `N` per task in chronological order.
D <- D %>%
  filter(!(task == "MONITOR" | task == "STOP")) %>%
  filter(year > 1500) %>%
  arrange(task, year) %>%
  group_by(task) %>%
  mutate(N = cumsum(n)) %>%
  ungroup()
# TODO: fill in empty years with 0 so all tasks have the same year range.
# Share of the cumulative publication count held by each task.
# Years are not filled in for tasks without publications, so filtering on
# year == 2020 would silently drop every task that has no 2020 row and would
# inflate the percentages of the remaining ones. Each task's latest row
# already carries its running total `N`, so use that row instead.
D %>%
  group_by(task) %>%
  slice_max(year, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(desc(N)) %>%
  select(task, N) %>%
  mutate(percentage = round(N / sum(N) * 100, 1))
# Yearly publication counts, one line per task. The legend is hidden; the
# static version labels the tasks to the right of the lines instead.
# NOTE(review): the caption reports nrow(D), i.e. the number of rows in `D`,
# not a number of papers — confirm this is the intended N.
p <- ggplot(data = D, mapping = aes(x = year, y = n, color = task)) +
  geom_line() +
  theme(legend.position = "none") +
  labs(
    title = "Frequency of tasks",
    subtitle = "",
    caption = paste0("Pubmed data: N=", nrow(D))
  )

# Static version: task labels taken from the 2020 rows, placed at x = 2022.
p2 <- p +
  geom_label(
    data = D %>% filter(year == 2020),
    mapping = aes(x = 2022, y = n, label = task),
    hjust = 0,
    nudge_x = 0.05
  ) +
  coord_cartesian(xlim = c(1940, 2030))
p2

# Interactive version (without the static labels).
ggplotly(p)
# Cumulative publication counts, one line per task.
# The title now says "Cumulative" so this figure can be told apart from the
# yearly-count plot, which uses the title "Frequency of tasks".
# NOTE(review): the caption reports nrow(D), i.e. the number of rows in `D`,
# not a number of papers — confirm this is the intended N.
p <- ggplot(D, aes(year, N, color = task)) +
  geom_line() +
  theme(legend.position = "none") +
  labs(
    title = "Cumulative frequency of tasks",
    subtitle = "",
    caption = paste0("Pubmed data: N=", nrow(D))
  )

# Static version. The legend is hidden, so the labels are the only way to
# identify a line: label every task at its final cumulative value rather than
# only the tasks that happen to have a row for 2020.
p2 <- p +
  geom_label(
    data = D %>%
      group_by(task) %>%
      slice_max(year, n = 1, with_ties = FALSE) %>%
      ungroup(),
    aes(x = 2022, y = N, label = task),
    hjust = 0,
    nudge_x = 0.05
  ) +
  coord_cartesian(xlim = c(1940, 2030))
p2

# Interactive version (without the static labels).
ggplotly(p)
# Is task use frequency determined by how old a task is?
# Plot each task's cumulative number of papers (latest available year) as a
# function of the year of its first occurrence.

# First year in which each task appears. Selecting on `year` explicitly
# replaces top_n(-1), which is superseded and implicitly weighted by whatever
# column happens to be last in `D`.
First <- D %>%
  group_by(task) %>%
  slice_min(year, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(year) %>%
  select(first_year = year, task)

# Cumulative total of each task, taken from its most recent year.
Last <- D %>%
  group_by(task) %>%
  slice_max(year, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(year) %>%
  select(task, N)

# Explicit join key instead of relying on the implicit natural join.
DD <- First %>% left_join(Last, by = "task")
pp <- ggplot(DD, aes(first_year, N, label = task, color = task, size = N)) +
  geom_point()
ggplotly(pp)
# It doesn't look like older tasks are necessarily used more;
# no recent task has been widely used. Are new tasks used less than expected, given the usage rate of older tasks?
# What is the rate of new task production?
# Density of first-occurrence years, with a rug marking each individual task.
First %>%
  ggplot(mapping = aes(x = first_year)) +
  geom_density() +
  geom_rug(alpha = 0.5)

# What is the rate of new task production relative to publication volume?
# Taking into account the increasing rate of publications, the decrease in task development becomes even more pronounced,
# i.e., when measured as new tasks per thousand papers.